C#写的蜘蛛程序(也叫小偷程序)

"蜘蛛"(Spider)是Internet上一种很有用的程序:搜索引擎利用蜘蛛程序将Web页面收集到数据库,企业利用蜘蛛程序监视竞争对手的网站并跟踪变动,个人用户用蜘蛛程序下载Web页面以便脱机使用,开发者利用蜘蛛程序扫描自己的Web站点以检查无效的链接……对于不同的用户,蜘蛛程序有不同的用途。那么,蜘蛛程序到底是怎样工作的呢?
蜘蛛是一种半自动的程序,就像现实当中的蜘蛛在它的Web(蜘蛛网)上旅行一样,蜘蛛程序也按照类似的方式在Web链接织成的网上旅行。蜘蛛程序之所以是半自动的,是因为它总是需要一个初始链接(出发点),但此后的运行情况就要由它自己决定了。蜘蛛程序会扫描起始页面包含的链接,然后访问这些链接指向的页面,再分析和追踪那些页面包含的链接。从理论上看,最终蜘蛛程序会访问到Internet上的每一个页面,因为Internet上几乎每一个页面总是被其他或多或少的页面引用。
namespace Spider
{
    using System;
    using System.IO;
    using System.Net;
    using System.Text;
    using System.Threading;

    /// <summary>
    /// Performs all of the work of a single spider thread: obtains a URL from
    /// the owning spider, downloads it, stores it locally and queues the links
    /// found on the page.
    /// </summary>
    public class DocumentWorker
    {
        /// <summary>
        /// The URI currently being downloaded by this worker. It is also the
        /// base against which relative links found on the page are resolved.
        /// </summary>
        private Uri m_uri;

        /// <summary>
        /// The spider that this worker "works for".
        /// </summary>
        private readonly Spider m_spider;

        /// <summary>
        /// The thread that runs <see cref="Process"/>.
        /// </summary>
        private Thread m_thread;

        /// <summary>
        /// The thread number, used only to identify this worker.
        /// </summary>
        private int m_number;

        /// <summary>
        /// The file name used for default (directory) documents.
        /// </summary>
        public const string IndexFile = "index.html";

        /// <summary>
        /// Constructor.
        /// </summary>
        /// <param name="spider">The spider that owns this worker.</param>
        public DocumentWorker(Spider spider)
        {
            m_spider = spider;
        }

        /// <summary>
        /// Takes a URI, such as /images/blank.gif, and converts it into the
        /// name of a file for local storage below the spider's output path.
        /// Any directory in the URI path that does not exist yet is created.
        /// </summary>
        /// <param name="uri">The URI of the file about to be stored.</param>
        /// <returns>The local file name, using '\' as directory separator.</returns>
        private string convertFilename(Uri uri)
        {
            string result = m_spider.OutputPath;

            // Add an ending backslash to the output root if needed.
            if (result.Length == 0 || result[result.Length - 1] != '\\')
            {
                result += "\\";
            }

            // Strip the query string, it is not part of the file name.
            string path = uri.PathAndQuery;
            int queryIndex = path.IndexOf('?');
            if (queryIndex != -1)
            {
                path = path.Substring(0, queryIndex);
            }

            // A last segment without a dot is treated as a directory whose
            // trailing '/' is missing, so the default document is appended.
            int lastSlash = path.LastIndexOf('/');
            int lastDot = path.LastIndexOf('.');
            if (path[path.Length - 1] != '/' && lastSlash > lastDot)
            {
                path += "/" + IndexFile;
            }

            // Split the path into its directory part and the actual file name.
            lastSlash = path.LastIndexOf('/');
            string filename = "";
            if (lastSlash != -1)
            {
                filename = path.Substring(lastSlash + 1);
                path = path.Substring(0, lastSlash + 1);
                if (filename.Length == 0)
                {
                    filename = IndexFile;
                }
            }

            // Create the directory structure, one path segment at a time.
            // The scan starts at 1 to skip the leading '/'.
            int segmentStart = 1;
            int segmentEnd = path.IndexOf('/', segmentStart);
            while (segmentEnd != -1)
            {
                string dirpart = path.Substring(segmentStart, segmentEnd - segmentStart);
                result += dirpart;
                result += "\\";
                Directory.CreateDirectory(result);

                segmentStart = segmentEnd + 1;
                segmentEnd = path.IndexOf('/', segmentStart);
            }

            // Attach the file name.
            result += filename;
            return result;
        }

        /// <summary>
        /// Saves the body of a response to disk as a binary file. Nothing is
        /// saved when the spider has no output path.
        /// </summary>
        /// <param name="response">The response whose body is saved.</param>
        private void SaveBinaryFile(WebResponse response)
        {
            if (string.IsNullOrEmpty(m_spider.OutputPath))
            {
                return;
            }

            byte[] buffer = new byte[1024];
            string filename = convertFilename(response.ResponseUri);

            // 'using' closes both streams even when a read or write fails.
            using (Stream outStream = File.Create(filename))
            using (Stream inStream = response.GetResponseStream())
            {
                int count;
                while ((count = inStream.Read(buffer, 0, buffer.Length)) > 0)
                {
                    outStream.Write(buffer, 0, count);
                }
            }
        }

        /// <summary>
        /// Saves a text file under the local name derived from the current
        /// URI. Nothing is saved when the spider has no output path.
        /// </summary>
        /// <param name="buffer">The text to save.</param>
        private void SaveTextFile(string buffer)
        {
            if (string.IsNullOrEmpty(m_spider.OutputPath))
            {
                return;
            }

            string filename = convertFilename(m_uri);
            using (StreamWriter outStream = new StreamWriter(filename))
            {
                outStream.Write(buffer);
            }
        }

        /// <summary>
        /// Downloads the current URI. Text documents are saved and returned;
        /// anything else is saved as a binary file.
        /// </summary>
        /// <returns>
        /// The text of the page with every line terminated by "\r\n", or
        /// <c>null</c> when the document is not text or the download failed.
        /// </returns>
        private string GetPage()
        {
            WebResponse response = null;
            Stream stream = null;
            StreamReader reader = null;

            try
            {
                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(m_uri);
                response = request.GetResponse();
                stream = response.GetResponseStream();

                if (!response.ContentType.StartsWith("text/", StringComparison.OrdinalIgnoreCase))
                {
                    SaveBinaryFile(response);
                    return null;
                }

                // StringBuilder avoids re-copying the whole page for each line.
                StringBuilder buffer = new StringBuilder();
                reader = new StreamReader(stream);
                string line;
                while ((line = reader.ReadLine()) != null)
                {
                    buffer.Append(line).Append("\r\n");
                }

                string page = buffer.ToString();
                SaveTextFile(page);
                return page;
            }
            catch (WebException e)
            {
                System.Console.WriteLine("下载失败,错误:" + e);
                return null;
            }
            catch (IOException e)
            {
                System.Console.WriteLine("下载失败,错误:" + e);
                return null;
            }
            finally
            {
                if (reader != null) reader.Close();
                if (stream != null) stream.Close();
                if (response != null) response.Close();
            }
        }

        /// <summary>
        /// Processes a link found on the current page. The link is handed to
        /// the spider if it is an http or https URL on the same host as the
        /// current URI; the spider is expected to decide whether it has been
        /// visited before.
        /// </summary>
        /// <param name="link">The URL to process; may be relative.</param>
        private void ProcessLink(string link)
        {
            Uri url;

            // Fully expand this URL if it was a relative link.
            try
            {
                url = new Uri(m_uri, link);
            }
            catch (UriFormatException e)
            {
                System.Console.WriteLine("Invalid URI:" + link + " Error:" + e.Message);
                return;
            }

            bool isHttp = string.Equals(url.Scheme, "http", StringComparison.OrdinalIgnoreCase);
            bool isHttps = string.Equals(url.Scheme, "https", StringComparison.OrdinalIgnoreCase);
            if (!isHttp && !isHttps)
            {
                return;
            }

            // Remove this check if you would like to spider the whole
            // Internet (yeah right, but it will try).
            if (!string.Equals(url.Host, m_uri.Host, StringComparison.OrdinalIgnoreCase))
            {
                return;
            }

            m_spider.addURI(url);
        }

        /// <summary>
        /// Scans a downloaded page and processes the HREF and SRC attribute
        /// of every tag reported by the parser.
        /// </summary>
        /// <param name="page">The text of the page to scan.</param>
        private void ProcessPage(string page)
        {
            ParseHTML parse = new ParseHTML();
            parse.Source = page;

            while (!parse.Eof())
            {
                // NOTE(review): a result of 0 presumably means a tag (rather
                // than a text character) was parsed - confirm against ParseHTML.
                char ch = parse.Parse();
                if (ch != 0)
                {
                    continue;
                }

                Attribute a = parse.GetTag()["HREF"];
                if (a != null)
                {
                    ProcessLink(a.Value);
                }

                a = parse.GetTag()["SRC"];
                if (a != null)
                {
                    ProcessLink(a.Value);
                }
            }
        }

        /// <summary>
        /// The main loop for the spider threads. Obtains URLs from the spider
        /// and processes them until the spider reports that it should quit.
        /// </summary>
        public void Process()
        {
            while (!m_spider.Quit)
            {
                m_uri = m_spider.ObtainWork();

                m_spider.SpiderDone.WorkerBegin();
                try
                {
                    System.Console.WriteLine("Download(" + this.Number + "):" + m_uri);

                    string page = GetPage();
                    if (page != null)
                    {
                        ProcessPage(page);
                    }
                }
                finally
                {
                    // Always pair WorkerBegin with WorkerEnd, even when page
                    // handling throws an exception that GetPage does not catch.
                    m_spider.SpiderDone.WorkerEnd();
                }
            }
        }

        /// <summary>
        /// Starts the worker thread, which runs <see cref="Process"/>.
        /// </summary>
        public void start()
        {
            ThreadStart ts = new ThreadStart(this.Process);
            m_thread = new Thread(ts);
            m_thread.Start();
        }

        /// <summary>
        /// The thread number. Used only to identify this thread.
        /// </summary>
        public int Number
        {
            get
            {
                return m_number;
            }

            set
            {
                m_number = value;
            }
        }
    }
}
评论